This project uses renv
to keep track of installed packages. Install renv if not
installed and load dependencies with renv::restore().
install.packages("renv")
renv::restore()
library(readr)
library(dplyr)
library(tidyr)
library(reshape2)
library(GenomicRanges)
library(pheatmap)
library(tibble)
library(ggplot2)
library(stringr)
library(cowplot)
library(markdown)
library(RColorBrewer)
library(GenomicAlignments)
library(reshape2)
samples <- read_tsv("config/samples.tsv", show_col_types = FALSE)
units <- read_tsv("config/units.tsv", show_col_types = FALSE)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
unite(sample_unit, sample_name, unit_name, remove = FALSE)
sample_units
idxstats to get human coverage for
normalizationNotes:
idxstats_exogenousrna_dir <-
"results/samtools_idxstats/exogenous_rna/"
idxstats_human_dir <-
"results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"
bowtie2_human_logs <-
"results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"
idxstats <- tibble()
for (row in seq_len(nrow(sample_units))) {
sample <- sample_units[row, ]$sample_unit
# Read `idsxstats` for exogenous mapped reads
exogenous_rna_stats <- read_tsv(
file.path(idxstats_exogenousrna_dir, sprintf("%s.bam.idxstats", sample)),
col_names = c(
"sequence_name", "sequence_length",
"mapped_reads", "unmapped_reads"
),
col_types = "ciii"
)
exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
filter(!sequence_name %in% c("*")) %>%
select(sequence_name, mapped_reads) %>%
mutate(sample = sample)
# Read `idxstats` for human mapped reads
human_stats <- read_tsv(
file.path(idxstats_human_dir, sprintf("%s.bam.idxstats", sample)),
col_names = c(
"sequence_name", "sequence_length",
"mapped_reads", "unmapped_reads"
),
col_types = "ciii"
)
grch38_mapped_reads <- human_stats %>%
filter(!sequence_name %in% c("*")) %>%
select(mapped_reads) %>%
sum()
grch38_mapped_reads <- tibble(
sequence_name = "grch38_mapped_reads",
mapped_reads = grch38_mapped_reads,
sample = sample
)
# Read bowtie2 logs for unmapped reads
bowtie2_log <- readLines(
file.path(bowtie2_human_logs, sprintf("%s.log", sample))
)
total_pairs <- strtoi(str_split(bowtie2_log[1], " ")[[1]][1])
total_reads <- total_pairs * 2
unmapped_reads <- tibble(
sequence_name = "unmapped",
mapped_reads = total_reads - grch38_mapped_reads$mapped_reads,
sample = sample
)
# Consolidate counts for rows
idxstats <- rbind(
idxstats,
exogenous_rna_mapped_reads,
grch38_mapped_reads,
unmapped_reads
)
}
idxstats
bedpe files to get exogenous rna coverage of
paired readsbedpe_data <- tibble()
for (sample in sample_units$sample_unit) {
data <-
readr::read_tsv(
sprintf(
"results/alignments/exogenous_rna/bedpe/%s.bedpe", sample
),
col_names = c(
"chrom1", "chrom1Start", "chrom1End",
"chrom2", "chrom2Start", "chrom2End",
"name", "score", "strand1", "strand2"
),
col_types = "ciiciicicc"
)
bedpe_data <- tibble(rbind(
bedpe_data,
cbind(
sample = sample,
data
)
))
}
bedpe_data
Concordant pairs are pairs of reads that:
--> .. <--)Discordant reads aligned but whose mate:
## Config and function definition
bam_dir <- "results/alignments/exogenous_rna/sorted"
last_day <- 0
cols <- brewer.pal(n = 5, name = "RdBu")
concordant_cell_line_colors <- list(
"Parental" = "#CA0020",
"P1E10" = "#0571B0"
)
discordant_cell_line_colors <- list(
"Parental" = "#F4A582",
"P1E10" = "#92C5DE"
)
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2")) {
t <- readDNAStringSet(sprintf("data/references/%s.fa", mix))
rna_mixes <- rbind(rna_mixes, tibble(
exogenous_rna = mix,
rna_species = word(t@ranges@NAMES, 1),
length = t@ranges@width
))
}
get_pegrna_plot_data <- function(sequence_name,
normalization_factor,
mix = NA) {
samples <- sample_units %>%
full_join(rna_mixes, by = "exogenous_rna") %>%
arrange(.data$exogenous_rna, day, cell_line) %>%
filter(rna_species == sequence_name)
if (!is.na(mix)) {
samples <- samples %>% filter(.data$exogenous_rna == mix)
}
samples <- samples %>% pull(.data$sample_unit)
plot_data <- list()
for (s in samples) {
if (normalization_factor == "exogenous_rna_mapped_reads") {
norm_seqname <- sequence_name
} else if (normalization_factor == "grch38_mapped_reads") {
norm_seqname <- "grch38_mapped_reads"
}
normalization_read_count <- idxstats %>%
filter(sample == s) %>%
filter(sequence_name == norm_seqname) %>%
pull(.data$mapped_reads)
cell_line <- sample_units %>%
filter(.data$sample_unit == s) %>%
pull(cell_line)
day <- sample_units %>%
filter(.data$sample_unit == s) %>%
pull(day)
which <- GRanges(sprintf("%s:1-%i", sequence_name, rna_mixes %>%
filter(sequence_name == sequence_name) %>%
pull(length)))
param_discordant <- ScanBamParam(
flag = scanBamFlag(isProperPair = FALSE),
which = which
)
discordant <- readGAlignments(sprintf("%s/%s.bam", bam_dir, s),
param = param_discordant
)
cov_discordant <- coverage(discordant)
cov_discordant_norm <- cov_discordant / normalization_read_count
which <- GRanges(sprintf("%s:1-%i", sequence_name, rna_mixes %>%
filter(sequence_name == sequence_name) %>%
pull(length)))
param_concordant <- ScanBamParam( # nolint
flag = scanBamFlag(isProperPair = TRUE),
which = which
)
df_complete <- bedpe_data %>%
filter(sample == s) %>%
filter(.data$chrom1 == sequence_name)
concordant <- GRanges(
ranges = IRanges(
start = df_complete$chrom1Start,
end = df_complete$chrom2End
),
seqnames = df_complete$chrom1
)
cov_concordant <- coverage(concordant)
cov_concordant_norm <- cov_concordant / normalization_read_count
plot_data[[s]] <- list(
cell_line = cell_line,
day = day,
discordant = cov_discordant_norm,
concordant = cov_concordant_norm
)
}
return(plot_data)
}
pegrna_plots <- function(sequence_name,
normalization_factor,
mix = NA,
ylim = NA,
ylab,
vlines = NA) {
plot_data <- get_pegrna_plot_data(
sequence_name = sequence_name,
normalization_factor = normalization_factor,
mix = mix
)
# Find largest value for rna_species
if (is.na(ylim)) {
m <- 0
for (s in names(plot_data)) {
m <- max(
m,
max(plot_data[[s]][["concordant"]][[sequence_name]]),
max(plot_data[[s]][["discordant"]][[sequence_name]])
)
}
ylim <- c(0, m)
}
last_day <- 0
par(mfrow = c(2, 1))
for (s in names(plot_data)) {
series_data_concordant <- plot_data[[s]][["concordant"]][[sequence_name]]
series_data_discordant <- plot_data[[s]][["discordant"]][[sequence_name]]
day <- plot_data[[s]][["day"]]
cell_line <- plot_data[[s]][["cell_line"]]
concordant_color <- concordant_cell_line_colors[[cell_line]]
discordant_color <- discordant_cell_line_colors[[cell_line]]
if (day != last_day) {
plot(series_data_concordant,
type = "l",
col = concordant_color,
ylim = ylim,
main = sprintf("Day %s", day),
xlab = sprintf("%s position", sequence_name),
ylab = ylab
)
lines(series_data_discordant,
type = "l",
col = discordant_cell_line_colors[[plot_data[[s]][["cell_line"]]]]
)
} else {
lines(series_data_concordant,
type = "l",
col = concordant_color
)
lines(series_data_discordant,
type = "l",
col = discordant_color
)
}
legend("top",
legend = c(
"Parental:Concordant",
"P1E10:Concordant",
"Parental:Discordant",
"P1E10:Discordant"
),
col = unlist(c(
concordant_cell_line_colors,
discordant_cell_line_colors
)),
lty = 1,
cex = 0.75
)
for (vline in vlines) {
abline(v = vline, lty = 2)
text(
x = vline, y = 0, as.character(vline),
pos = 2, cex = 0.75, lwd = 0.5
)
}
last_day <- day
}
}
rna_mix_rows <- rna_mixes %>% filter(grepl("VEGFA", rna_species))
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (row in seq_len(nrow(rna_mix_rows))) {
rna_species <- rna_mix_rows[[row, "rna_species"]]
mix <- rna_mix_rows[[row, "exogenous_rna"]]
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
mix = mix,
ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
)
}
}
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (rna_species in rna_mixes %>%
filter(grepl("FANCF", rna_species)) %>%
pull(rna_species)) {
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
)
}
}
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (rna_species in rna_mixes %>%
filter(grepl("HEK3", rna_species)) %>%
pull(rna_species)) {
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
)
}
}
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (rna_species in rna_mixes %>%
filter(grepl("DNMT1", rna_species)) %>%
pull(rna_species)) {
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
)
}
}
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (rna_species in rna_mixes %>%
filter(grepl("RUNX1", rna_species)) %>%
pull(rna_species)) {
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
)
}
}
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (rna_species in rna_mixes %>%
filter(grepl("EMX1", rna_species)) %>%
pull(rna_species)) {
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
)
}
}
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (rna_species in rna_mixes %>%
filter(grepl("RNF2", rna_species)) %>%
pull(rna_species)) {
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
)
}
}